In [1]:
# Computations
import pandas as pd
import numpy as np

# sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Text
from colorama import Fore, Back, Style
import re

# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go

# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

# plt setting
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Daily Weather Data Analysis and Classification using Feature Importance

In this article, we analyze a weather dataset from Kaggle.com.

Table of Contents

Daily Weather Dataset

Data description from Kaggle:

  • Sensor measurements from the weather station were captured at one-minute intervals. These measurements were then processed to generate values to describe daily weather. Since this dataset was created to classify
  • low-humidity days vs. non-low-humidity days (that is, days with normal or high humidity), the variables included are weather measurements in the morning, with one measurement, namely relative humidity, in the afternoon. The idea is to use the morning weather values to predict whether the day will be low-humidity or not based on the afternoon measurement of relative humidity.
  • Each row, or sample, consists of the following variables:
    • Number: unique number for each row
    • air_pressure_9am: air pressure averaged over a period from 8:55 AM to 9:04 AM (Unit: hectopascals)
    • air_temp_9am: air temperature averaged over a period from 8:55 AM to 9:04 AM (Unit: degrees Fahrenheit)
  • air_wind_direction_9am: wind direction averaged over a period from 8:55AM to 9:04AM (Unit: degrees,
    • with 0 means coming from the North, and increasing clockwise)
  • air_wind_speed_9am: wind speed averaged over a period from 8:55AM to 9:04AM (Unit: miles per hour) max_wind_direction_9am: wind gust direction averaged over a period from 8:55AM to 9:10AM (Unit:
    • degrees, with 0 being North and increasing clockwise)
  • max_wind_speed_9am: wind gust speed averaged over a period from 8:55 AM to 9:04 AM (Unit: miles per hour)
  • rain_accumulation_9am: the amount of rain accumulated in the 24 hours before 9 AM (Unit: millimeters)
  • rain_duration_9am: the amount of time rain was recorded in the 24 hours before 9 AM (Unit: seconds)
  • relative_humidity_9am: relative humidity averaged over a period from 8:55 AM to 9:04 AM (Unit: percent)
  • relative_humidity_3pm: relative humidity averaged over a period from 2:55 PM to 3:04 PM (Unit: percent )

Loading the Dataset

In [2]:
# Load the daily weather data and discard the row-identifier column,
# which carries no predictive information.
Data = pd.read_csv('weatherdata/daily_weather.csv').drop(columns=['number'])
Data.head().style.hide_index().set_precision(2)
Out[2]:
air_pressure_9am air_temp_9am avg_wind_direction_9am avg_wind_speed_9am max_wind_direction_9am max_wind_speed_9am rain_accumulation_9am rain_duration_9am relative_humidity_9am relative_humidity_3pm
918.06 74.82 271.10 2.08 295.40 2.86 0.00 0.00 42.42 36.16
917.35 71.40 101.94 2.44 140.47 3.53 0.00 0.00 24.33 19.43
923.04 60.64 51.00 17.07 63.70 22.10 0.00 20.00 8.90 14.46
920.50 70.14 198.83 4.34 211.20 5.19 0.00 0.00 12.19 12.74
921.16 44.29 277.80 1.86 136.50 2.86 8.90 14730.00 92.41 76.74

Features

Columns Description
Air Pressure Air pressure in hectopascal (100 pascals) at 9 AM
Air Temperature Air temperature in degrees Fahrenheit at 9 AM
Avg Wind Direction Average wind direction over the minute before the timestamp in degrees (0 starts from the north) at 9 AM
Avg Wind Speed Average wind speed over the minute before the timestamp in meter per seconds (m/s) at 9 AM
Max Wind Direction Highest wind direction in the minute before the timestamp in degrees (0 starts from the north) at 9 AM
Max Wind Speed Highest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM
Min Wind Speed Smallest wind speed in the minute before the timestamp in meter per seconds (m/s) at 9 AM
Rain Accumulation Accumulated rain in millimeters (mm) at 9 AM
Rain Duration Length of time rain in seconds (s) at 9 AM
Relative Humidity (Morning) Relative humidity in percentage in at 9 AM
Relative Humidity (Afternoon) Relative humidity in percentage at 3 PM

For convenience, we would like to modify the feature names.

In [3]:
def _tidy(name):
    """Map a raw column name like 'relative_humidity_9am' to 'Relative Humidity (Morning)'."""
    renamed = (name.replace('ty_9am', 'ty_(Morning)')
                   .replace('3pm', '(Afternoon)')
                   .replace('_9am', '')
                   .replace('_', ' '))
    return renamed.title().replace('Temp', 'Temperature')

Data.columns = [_tidy(col) for col in Data.columns]
Data.head(5).style.hide_index().set_precision(2)
Out[3]:
Air Pressure Air Temperature Avg Wind Direction Avg Wind Speed Max Wind Direction Max Wind Speed Rain Accumulation Rain Duration Relative Humidity (Morning) Relative Humidity (Afternoon)
918.06 74.82 271.10 2.08 295.40 2.86 0.00 0.00 42.42 36.16
917.35 71.40 101.94 2.44 140.47 3.53 0.00 0.00 24.33 19.43
923.04 60.64 51.00 17.07 63.70 22.10 0.00 20.00 8.90 14.46
920.50 70.14 198.83 4.34 211.20 5.19 0.00 0.00 12.19 12.74
921.16 44.29 277.80 1.86 136.50 2.86 8.90 14730.00 92.41 76.74

Preprocessing

Imputing Missing Values

Note that the dataset contains missing values; we summarize them below before imputing.

In [4]:
def Data_info(Inp, Only_NaN = False):
    """Summarize each column's dtype, NaN count, and NaN percentage.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame to profile.
    Only_NaN : bool, default False
        If True, keep only columns that contain at least one NaN.

    Returns
    -------
    pd.DataFrame indexed by column name, sorted by dtype.
    """
    nan_counts = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(nan_counts, how='outer')
    Out['Percentage'] = np.round(100 * Out['Number of NaN Values'] / Inp.shape[0], 2)
    if Only_NaN:
        Out = Out[Out['Number of NaN Values'] > 0]
    return Out
# Columns that still contain missing values; keep their names for imputation.
nan_summary = Data_info(Data, Only_NaN = True)
display(nan_summary)
Temp = nan_summary.index.tolist()
Data Type Number of NaN Values Percentage
Air Pressure float64 3 0.27
Air Temperature float64 5 0.46
Avg Wind Direction float64 4 0.37
Avg Wind Speed float64 3 0.27
Max Wind Direction float64 3 0.27
Max Wind Speed float64 4 0.37
Rain Accumulation float64 6 0.55
Rain Duration float64 3 0.27
In [5]:
# Replace each remaining NaN with its column mean, then re-check for NaNs.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
Data[Temp] = mean_imputer.fit_transform(Data[Temp])
Data_info(Data)
Out[5]:
Data Type Number of NaN Values Percentage
Air Pressure float64 0 0.0
Air Temperature float64 0 0.0
Avg Wind Direction float64 0 0.0
Avg Wind Speed float64 0 0.0
Max Wind Direction float64 0 0.0
Max Wind Speed float64 0 0.0
Rain Accumulation float64 0 0.0
Rain Duration float64 0 0.0
Relative Humidity (Morning) float64 0 0.0
Relative Humidity (Afternoon) float64 0 0.0

Problem Description

Let's set Relative Humidity (Afternoon) as the target variable. This means given the dataset and using the rest of the features, we would like to know whether it is humid or not at 3 PM. In doing so, we can consider the median of Relative Humidity (Afternoon). Then, assign 1 to values above the median value, and 0 to values equal to or below the median value.

In [6]:
# Binarize the target: 1 when afternoon humidity exceeds the median, else 0.
# NOTE(review): values exactly equal to the median fall in class 0.
Median = Data['Relative Humidity (Afternoon)'].median()
Temp = (Data['Relative Humidity (Afternoon)'] > Median).astype(int)
Target = 'Relative Humidity (Afternoon)'

Modeling

First off, let's look at the variance of our dataset features.

In [7]:
# Feature variances differ by several orders of magnitude, motivating scaling.
feature_variance = Data.iloc[:, :-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(feature_variance.style.background_gradient(cmap='OrRd').set_precision(2))
Variance
Rain Duration 2546852.52
Avg Wind Direction 4762.57
Max Wind Direction 4508.55
Relative Humidity (Morning) 648.83
Air Temperature 124.32
Max Wind Speed 31.23
Avg Wind Speed 20.67
Air Pressure 10.11
Rain Accumulation 2.53

Furthermore, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().

In [8]:
# Build the modeling frame: standardized features plus the binary target,
# then confirm every feature now has unit variance.
df = Data.copy()
df['Relative Humidity (Afternoon)'] = Temp
scaler = StandardScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

scaled_variance = df.iloc[:, :-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(scaled_variance.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True)).set_precision(2))
Variance
Rain Duration 1.00
Air Temperature 1.00
Avg Wind Speed 1.00
Max Wind Speed 1.00
Max Wind Direction 1.00
Avg Wind Direction 1.00
Relative Humidity (Morning) 1.00
Air Pressure 1.00
Rain Accumulation 1.00
In [9]:
# Sanity check after scaling: features center near 0 with unit std,
# while the 0/1 target keeps its original values.
df.describe().style.set_precision(2)
Out[9]:
Air Pressure Air Temperature Avg Wind Direction Avg Wind Speed Max Wind Direction Max Wind Speed Rain Accumulation Rain Duration Relative Humidity (Morning) Relative Humidity (Afternoon)
count 1095.00 1095.00 1095.00 1095.00 1095.00 1095.00 1095.00 1095.00 1095.00 1095.00
mean 0.00 -0.00 -0.00 -0.00 0.00 -0.00 0.00 -0.00 0.00 0.50
std 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.50
min -3.43 -2.53 -1.84 -1.06 -1.79 -1.04 -0.13 -0.18 -1.11 0.00
25% -0.73 -0.68 -1.10 -0.72 -1.08 -0.71 -0.13 -0.18 -0.75 0.00
50% 0.01 0.07 0.34 -0.36 0.42 -0.37 -0.13 -0.18 -0.43 0.00
75% 0.72 0.76 0.71 0.40 0.78 0.33 -0.13 -0.18 0.44 1.00
max 3.28 3.05 2.92 3.97 2.43 4.09 14.99 10.91 2.29 1.00
In [10]:
# Hold out 30% of the data for testing; fixed seed for a reproducible split.
X = df.drop(columns = [Target])
y = df[Target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
split_shapes = {'X_train': X_train.shape, 'X_test': X_test.shape,
                'y_train': y_train.shape, 'y_test': y_test.shape}
pd.DataFrame(data={'Set': list(split_shapes), 'Shape': list(split_shapes.values())}).set_index('Set').T
Out[10]:
Set X_train X_test y_train y_test
Shape (766, 9) (329, 9) (766,) (329,)

A number of functions that we would use.

In [11]:
def Performance(clf, X_test):
    """Display and return accuracy plus weighted F1/precision/recall for a fitted classifier.

    Parameters
    ----------
    clf : fitted classifier exposing predict() and score().
    X_test : feature frame to evaluate on.

    NOTE(review): the true labels come from the notebook-global ``y_test``.

    Returns
    -------
    One-row pd.DataFrame of the scores.
    """
    y_pred = clf.predict(X_test)
    # Human-readable name, e.g. 'DecisionTreeClassifier' -> 'Decision Tree Classifier'.
    clf_name = re.sub(r"(\w)([A-Z])", r"\1 \2", re.findall('[^()]+', str(clf))[0])
    # pd.DataFrame([record]) replaces DataFrame.append, which was removed in pandas 2.0.
    # Keys are ordered alphabetically to match the column order append used to produce.
    df = pd.DataFrame([{'Classifier': clf_name,
                        'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
                        'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
                        'Recall Score': recall_score(y_test.values, y_pred, average= 'weighted'),
                        'Score': clf.score(X_test, y_test)}])
    display(df.style.hide_index().set_precision(2))
    return df

def highlight_max(s):
    """Style helper: green background on the maximum value(s) of a Series."""
    peak = s.max()
    return ['background-color: SpringGreen' if value == peak else '' for value in s]


def Feature_Ranking(clf):
    """Run RFE selecting 2..(n_features-1) features, display the accuracy per
    subset size, and return the feature list with the best test accuracy.

    Parameters
    ----------
    clf : estimator usable by sklearn.feature_selection.RFE.

    NOTE(review): relies on notebook globals X, X_train, y_train, X_test, y_test.

    Returns
    -------
    list of feature names achieving the maximum score.
    """
    # Collect one record per subset size; building a list and constructing the
    # frame once replaces the quadratic df.append loop (removed in pandas 2.0).
    records = []
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train, y_train)
        records.append({'Number of Features to Select': n,
                        'Score': accuracy_score(y_test, selector.predict(X_test)),
                        'Features': X.columns[selector.support_].tolist(),
                        'Best Features': X.columns[selector.ranking_ == 1].tolist()})

    df = pd.DataFrame(records, columns=['Number of Features to Select', 'Score', 'Features', 'Best Features'])
    df['Number of Features to Select'] = df['Number of Features to Select'].astype(int)
    df['Score'] = df['Score'].round(2)
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]

def ROC_Curve(clf, X_test):
    """Plot the ROC curve (AUC in the legend) for a fitted classifier.

    NOTE(review): the true labels come from the notebook-global ``y_test``.

    Returns
    -------
    pd.DataFrame with false positive rates, true positive rates, thresholds.
    """
    fpr, tpr, threshold = roc_curve(y_test, clf.predict_proba(X_test)[:,1])

    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % auc(fpr, tpr))
    ax.plot([0, 1], [0, 1],'r--', lw=2)  # chance diagonal for reference
    ax.legend(loc = 'lower right', fontsize = 14)
    pad = 0.01  # small margin so the curve is not clipped at the axes
    ax.set_xlim([-pad, 1 + pad])
    ax.set_ylim([-pad, 1 + pad])
    ax.set_xlabel('False Positive Rate (FPR)')
    ax.set_ylabel('True Positive Rate (TPR)')
    return pd.DataFrame({'false positive rates':fpr, 'true positive rates':tpr, 'thresholds':threshold})

def Confusion_Matrix_Plot(clf, X_train, y_train, X_test, y_test, Labels):
    """Draw raw and row-normalized confusion matrices for train and test sets.

    Parameters
    ----------
    clf : fitted classifier exposing predict().
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    Labels : tick labels for the two classes, e.g. ['Not Humid', 'Humid'].
    """
    # The original cell duplicated the same plotting code for train and test;
    # both halves now share one helper.
    _confusion_panels(clf, X_train, y_train, Labels, 'Train Set')
    _confusion_panels(clf, X_test, y_test, Labels, 'Test Set')


def _confusion_panels(clf, X, y, Labels, title):
    """One figure per dataset: raw counts (left) and row-normalized (right)."""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(title, fontsize = 18)
    sns.heatmap(cm, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, cbar_kws={"shrink": 1})
    ax[0].set_xlabel('Predicted labels')
    ax[0].set_ylabel('True labels')
    ax[0].set_title('Confusion Matrix')
    ax[0].xaxis.set_ticklabels(Labels)
    ax[0].yaxis.set_ticklabels(Labels)

    # Normalize each row so cells show the fraction of each true class.
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.heatmap(cm_norm, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
    ax[1].set_xlabel('Predicted labels')
    ax[1].set_ylabel('True labels')
    ax[1].set_title('Normalized Confusion Matrix')
    ax[1].xaxis.set_ticklabels(Labels)
    ax[1].yaxis.set_ticklabels(Labels)

DecisionTreeClassifier

First, let's try scikit-learn Decision Tree Classifier.

In [12]:
# Fit a decision tree on all features and report its test-set performance.
# NOTE(review): no random_state is set, so results may vary between runs.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
Performance_dtc = Performance(dtc, X_test)
ROC_dtc = ROC_Curve(dtc, X_test)
Classifier F1 Score Precision Score Recall Score Score
Decision Tree Classifier 0.88 0.88 0.88 0.88
In [13]:
# Rank feature subsets with RFE for the decision tree and echo the winner.
Best_Features_dtc = Feature_Ranking(dtc)
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Features:')
print(Style.RESET_ALL)
print(', '.join(Best_Features_dtc))
Number of Features to Select Score Features Best Features
0 2 0.820000 ['Air Pressure', 'Relative Humidity (Morning)'] ['Air Pressure', 'Relative Humidity (Morning)']
1 3 0.870000 ['Air Pressure', 'Air Temperature', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Relative Humidity (Morning)']
2 4 0.850000 ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Relative Humidity (Morning)']
3 5 0.860000 ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)']
4 6 0.870000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)']
5 7 0.870000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)']
6 8 0.880000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Accumulation', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Accumulation', 'Relative Humidity (Morning)']
Best Features:

Air Pressure, Air Temperature, Avg Wind Direction, Avg Wind Speed, Max Wind Direction, Max Wind Speed, Rain Accumulation, Relative Humidity (Morning)
In [14]:
# Refit the decision tree on its best feature subset and re-evaluate.
dtc = DecisionTreeClassifier()
dtc.fit(X_train[Best_Features_dtc], y_train)
Performance_dtc = Performance(dtc, X_test[Best_Features_dtc])
ROC_dtc = ROC_Curve(dtc, X_test[Best_Features_dtc])

Confusion_Matrix_Plot(dtc, X_train[Best_Features_dtc], y_train,
                      X_test[Best_Features_dtc], y_test, Labels = ['Not Humid','Humid'])
Classifier F1 Score Precision Score Recall Score Score
Decision Tree Classifier 0.88 0.88 0.88 0.88

Random Forest Classifier

Next, let's use scikit-learn Random Forest Classifier.

In [15]:
# Fit a random forest on all features and report its test-set performance.
# NOTE(review): no random_state is set, so results may vary between runs.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
Performance_rfc = Performance(rfc, X_test)
ROC_rfc = ROC_Curve(rfc, X_test)
Classifier F1 Score Precision Score Recall Score Score
Random Forest Classifier 0.92 0.92 0.92 0.92
In [16]:
# Rank feature subsets with RFE for the random forest and echo the winner.
Best_Features = Feature_Ranking(rfc)
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Features:')
print(Style.RESET_ALL)
print(', '.join(Best_Features))
Number of Features to Select Score Features Best Features
0 2 0.880000 ['Air Pressure', 'Relative Humidity (Morning)'] ['Air Pressure', 'Relative Humidity (Morning)']
1 3 0.900000 ['Air Pressure', 'Max Wind Direction', 'Relative Humidity (Morning)'] ['Air Pressure', 'Max Wind Direction', 'Relative Humidity (Morning)']
2 4 0.900000 ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Relative Humidity (Morning)']
3 5 0.920000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Relative Humidity (Morning)']
4 6 0.920000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)']
5 7 0.920000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)']
6 8 0.910000 ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Duration', 'Relative Humidity (Morning)'] ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Duration', 'Relative Humidity (Morning)']
Best Features:

Air Pressure, Air Temperature, Avg Wind Direction, Max Wind Direction, Relative Humidity (Morning)
In [17]:
# Refit the random forest on its best feature subset and re-evaluate.
rfc = RandomForestClassifier()
rfc.fit(X_train[Best_Features], y_train)
Performance_rfc = Performance(rfc, X_test[Best_Features])
ROC_rfc = ROC_Curve(rfc, X_test[Best_Features])

# BUG FIX: the original cell re-plotted the decision tree (dtc with
# Best_Features_dtc) here; plot the freshly refitted random forest instead.
Confusion_Matrix_Plot(rfc, X_train[Best_Features], y_train,
                      X_test[Best_Features], y_test, Labels = ['Not Humid','Humid'])
Classifier F1 Score Precision Score Recall Score Score
Random Forest Classifier 0.91 0.91 0.91 0.91

Final Thoughts

In [18]:
# Side-by-side score comparison of the two classifiers.
comparison = pd.concat([Performance_dtc, Performance_rfc], ignore_index=True)
comparison.style.hide_index().background_gradient(cmap='Greens').set_precision(2)
Out[18]:
Classifier F1 Score Precision Score Recall Score Score
Decision Tree Classifier 0.88 0.88 0.88 0.88
Random Forest Classifier 0.91 0.91 0.91 0.91
In [19]:
# Overlay both ROC curves interactively with plotly.
fig = go.Figure()
fig.add_trace(go.Scatter(x= ROC_dtc['false positive rates'].values,
                         y= ROC_dtc['true positive rates'].values,
                         line=dict(color='Blue', width= 1.5),
                         name = '<b>Decision Tree</b>'))
fig.add_trace(go.Scatter(x= ROC_rfc['false positive rates'].values,
                         y= ROC_rfc['true positive rates'].values,
                         line=dict(color='Green', width= 1.5),
                         name = '<b>Random Forest</b>'))
# Chance diagonal for reference.
fig.add_trace(go.Scatter(x= [0,1], y= [0,1], showlegend = False,
                         line=dict(color='black', width=2, dash='dot')))
fig.update_layout(legend_title='Classifiers',
                  legend=dict(y=0.5, traceorder='reversed', font_size=12),
                  dragmode='select', plot_bgcolor= 'white',
                  width=610, height=500, hovermode='closest',
                  title={'text': 'ROC Curves',
                         'x':0.43, 'y':0.85,
                         'xanchor': 'center', 'yanchor': 'top'})
# Both axes share the same grid/line styling and a small margin around [0, 1].
pad = 1e-2
axis_style = dict(showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[-pad, 1+pad],
                  showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                  zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray')
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)
fig.show()

We can see the area under the curve for Random Forest Classifier is better, therefore, this classifier performs the best here.